#!/bin/bash
#
# AWQ INT4 + temperature experiments: dump real quantized weights, then run
# perplexity_yarn.py on the pre-tokenized GovReport test set under several
# context-extension settings (--ntk 8 / --yarn 8).
#
# Usage: run from the directory that contains perplexity_yarn.py; relative
# paths below (output/, ppl_results/, quant_cache/, llm-awq/) resolve against
# the current working directory.
#
# Optional environment overrides (defaults reproduce the original paths):
#   MODEL_PATH  HF checkpoint directory passed as --model_path / -m
#   AWQ_CACHE   AWQ search results passed as --load_awq / --awq_cache
#   QUANT_DUMP  where awq.entry is asked to write the quantized weights
#   QUANT_PATH  quantized weights the evaluator loads via --quant_path
#
# Exit status: 0 only if every step succeeded.

set -euo pipefail

readonly MODEL_PATH="${MODEL_PATH:-/home/yeq6/Research_project/llama/llama-2-7b-chat_hf}"
readonly AWQ_CACHE="${AWQ_CACHE:-llm-awq/awq_cache/llama2-7b-w4-g128_mine.pt}"
readonly QUANT_DUMP="${QUANT_DUMP:-quant_cache/llama-2-7b-chat-w4-g128_mine_temp_5.pt}"
# NOTE(review): this is not the path given to --dump_quant above: the file name
# carries a "-v2" suffix (presumably added by awq.entry on dump — confirm) and
# the directory is llm-awq/quant_cache rather than ./quant_cache. Verify that
# the dump really lands here, otherwise the evaluations read an older file.
readonly QUANT_PATH="${QUANT_PATH:-/home/yeq6/Research_project/llama/llm-awq/quant_cache/llama-2-7b-chat-w4-g128_mine_temp_5-v2.pt}"

readonly RESULTS_DIR="ppl_results"

# Option groups shared verbatim by several evaluation runs.
readonly -a WINDOW_ARGS=(--sliding-window 2048 --tokens-step 384 --aggressive-memory)
readonly -a YARN_AWQ_ARGS=(
  --yarn 8 --original --custom-model
  --original-max-position-embeddings 2048 --awq
)

# Number of steps that failed; drives the final exit status.
failures=0

#######################################
# Record a failed step without aborting the remaining, independent ones.
# Globals:   failures (written)
# Arguments: $1 - human-readable step name
# Outputs:   error message on stderr
#######################################
note_failure() {
  printf 'error: %s failed\n' "$1" >&2
  failures=$((failures + 1))
}

#######################################
# Generate real quantized weights (INT4, group size 128) with temperature 5.
# Globals:   MODEL_PATH, AWQ_CACHE, QUANT_DUMP (read)
# Returns:   exit status of awq.entry
#######################################
generate_quantized_weights() {
  python -m awq.entry --model_path "$MODEL_PATH" \
    --w_bit 4 --q_group_size 128 --load_awq "$AWQ_CACHE" \
    --q_backend real --dump_quant "$QUANT_DUMP" --temp 5
}

#######################################
# Run perplexity_yarn.py with the dataset/sampling options common to all runs.
# Globals:   MODEL_PATH (read)
# Arguments: $1  - value for --output-file
#            $2+ - run-specific options, appended after the common ones
# Returns:   exit status of perplexity_yarn.py
#######################################
run_perplexity() {
  local -r output_file=$1
  shift
  python perplexity_yarn.py --tokenized output/govreport-test-tokenized-awq \
    --dataset-min-tokens 20384 --samples 10 \
    --output-file "$output_file" \
    --min-tokens 256 --max-tokens 19456 -m "$MODEL_PATH" \
    "$@"
}

main() {
  # Cheap insurance: if the evaluator does not create the directory itself, a
  # missing one would only surface when results are written.
  mkdir -p -- "$RESULTS_DIR"

  # The two --quant_path runs consume the dump produced here, so they are
  # skipped when generation fails instead of silently evaluating stale weights.
  if generate_quantized_weights; then
    # test ntk awq with temperature
    run_perplexity "${RESULTS_DIR}/govreport_awq_int4_temp_5.csv" \
      --quant_path "$QUANT_PATH" \
      "${WINDOW_ARGS[@]}" --awq --ntk 8 \
      || note_failure "NTK AWQ evaluation"

    # test yarn awq with temperature
    run_perplexity "${RESULTS_DIR}/govreport_awq_int4_yarn_temp_5.csv" \
      --quant_path "$QUANT_PATH" \
      "${WINDOW_ARGS[@]}" "${YARN_AWQ_ARGS[@]}" \
      || note_failure "YaRN AWQ evaluation"
  else
    note_failure "quantized weight generation (skipping --quant_path evaluations)"
  fi

  # The remaining runs take --awq_cache rather than the dumped weights, so they
  # run regardless of the generation step's outcome.

  # only rescale embedding with 2048 original context size, new implementation
  # NOTE(review): this run and the next share the same --output-file; unless
  # perplexity_yarn.py appends, the second run replaces the first's results.
  run_perplexity "${RESULTS_DIR}/new_embedding_only" \
    --awq_cache "$AWQ_CACHE" \
    --awq_rescale_temp 5 --beta_point 1440 \
    "${WINDOW_ARGS[@]}" "${YARN_AWQ_ARGS[@]}" \
    || note_failure "embedding-only rescale evaluation"

  # Dynamic rescale embedding with log distance
  run_perplexity "${RESULTS_DIR}/new_embedding_only" \
    --awq_cache "$AWQ_CACHE" \
    --dynamic_with_log_distance \
    "${WINDOW_ARGS[@]}" "${YARN_AWQ_ARGS[@]}" \
    --comments dynamic_with_log_distance \
    || note_failure "dynamic log-distance rescale evaluation"

  if ((failures > 0)); then
    printf 'error: %d step(s) failed\n' "$failures" >&2
    return 1
  fi
}

main "$@"